import urllib.request
import time
# Pause for two seconds once at start-up, before any request is issued.
# NOTE(review): if this was meant to throttle the page requests, it would have
# to run inside the fetch loop instead — confirm the intent.
time.sleep(2)
import re

# Browser-like User-Agent sent with every index request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# Fetch the news IDs from each index page and save them, one per line.
#
# The output file is opened before any request is made (so an invalid path
# fails immediately, outside the try block), and the `with` statement
# guarantees the handle is closed even when a request or decode error aborts
# the scrape. IDs are written only after every page has been fetched.
#
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# other parts of the file may read this list.
id = []

# Captures the data-id attribute of each headline link on an index page.
pat = '<a href="newsDetail_forward_.*?" class="tiptitleImg" data-id="(.*?)" target="_blank"> '
news_id_re = re.compile(pat)  # compiled once rather than once per page

with open('D://pengpai/pengpai_newsid.txt', 'w', encoding='utf-8') as newsID:  # change the save path as needed
    try:
        for i in range(1, 26):  # upper bound is 26 because The Paper only has 25 pages of content
            url = 'https://www.thepaper.cn/load_index.jsp?nodeids=90069,&pageidx='+str(i)
            html = urllib.request.Request(url=url, headers=headers)
            # Close the HTTP response as soon as the page body has been read.
            with urllib.request.urlopen(html) as response:
                htmlopen = response.read().decode('utf-8')
            newsid = news_id_re.findall(htmlopen)
            id.extend(t + '\n' for t in newsid)
            print(len(id))  # running total of IDs collected so far
        newsID.writelines(id)
    except Exception as err:
        # Best-effort script: report the failure and carry on without raising.
        print(err)
